library(tidyverse)
library(plotly)
library(sf)
library(mapview)
library(tigris)
library(censusapi)
library(leaflet)
library(lehdr)
options(
tigris_class = "sf",
tigris_use_cache = TRUE
)
Sys.setenv(CENSUS_KEY="10dcd73d7c043e91bac9fb8d3989cbff54b08790")
Here I obtain various demographic data, including income (percent below 50% and 80% of area median income), vehicle ownership, age, English language ability, and occupants per room.
# obtain the saved census data
setwd("~/Documents/2020 Spring Quarter/CEE 218Z")
acs_vars = readRDS("censusData2018_acs_acs5.rds")
setwd("~/Documents/2020 Spring Quarter/CEE 218Z/covid19")
# load in income data - code adapted from other students
sj_median_income_by_block <-
getCensus(
name = "acs/acs5",
vintage = 2018,
region = "block group:*",
regionin = "state:06+county:085",
vars = "B19013_001E"
) %>%
mutate(
blockgroup =
paste0(state,county,tract,block_group)
) %>%
select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>%
rename(
Median_Income = B19013_001E
) %>%
filter(!is.na(Median_Income)) %>%
left_join(sj_blockgroups, by = c("blockgroup" = "GEOID")) %>% #this code gives each blockgroup a district designation
filter(
!is.na(DISTRICTS)
) %>%
# this code joins our census data with the social distancing data, processed as shown below
left_join(sj_socialdistancing %>%
filter(weekend == F) %>%
filter(date > shelter_start) %>%
group_by(origin_census_block_group) %>%
summarize(
completely_home_device_count = sum(completely_home_device_count),
device_count = sum(device_count)) %>%
mutate(`% Completely at Home` = (completely_home_device_count/device_count*100) %>% round(1),
`% not completely at home` = (100 - `% Completely at Home`)),
by = c("blockgroup" = "origin_census_block_group")
) %>%
filter(
!is.na(device_count)
) %>%
left_join(sj_pre_sd_at_home_average %>% dplyr::select(origin_census_block_group, `% Completely at Home Pre Shelter`, `% not completely at home pre shelter`), by = c("blockgroup" = "origin_census_block_group"))
sj_ami_by_block <-
getCensus(
name = "acs/acs5",
vintage = 2018,
region = "block group:*",
regionin = "state:06+county:085",
vars = "group(B19001)"
) %>%
mutate(
blockgroup =
paste0(state,county,tract,block_group)
) %>%
select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>%
dplyr::select(-c(contains("EA"),contains("MA"),contains("M"))) %>%
group_by(blockgroup) %>%
summarize(
Total = B19001_001E,
`Under 75,000` = sum(B19001_002E, B19001_003E, B19001_004E, B19001_005E, B19001_006E, B19001_007E, B19001_008E, B19001_009E, B19001_010E, B19001_011E, B19001_012E),
#sum(lapply(2:12, function(x) as.name(paste0("B19001_00",x,"E"))))
`Under 100,000` = sum(B19001_002E, B19001_003E, B19001_004E, B19001_005E, B19001_006E, B19001_007E, B19001_008E, B19001_009E, B19001_010E, B19001_011E, B19001_012E, B19001_013E),
`Under 125,000` = sum(B19001_002E, B19001_003E, B19001_004E, B19001_005E, B19001_006E, B19001_007E, B19001_008E, B19001_009E, B19001_010E, B19001_011E, B19001_012E, B19001_013E, B19001_014E)
) %>%
mutate(
`% under 75,000` = `Under 75,000` / Total * 100,
`% over 75,000` = (100 - `% under 75,000`),
`% under 100,000` = `Under 100,000` / Total * 100,
`% over 100,000` = (100 - `% under 100,000`),
`% under 125,000` = `Under 125,000` / Total * 100,
`% over 125,000` = (100 - `% under 125,000`),
) %>%
left_join(sj_median_income_by_block %>% dplyr::select(-Median_Income)
) %>%
filter(!is.na(device_count))
# loading in language data - code adapted from other students
sj_lang_by_block <-
getCensus(
name = "acs/acs5",
vintage = 2018,
region = "block group:*",
regionin = "state:06+county:085",
vars = "group(B16004)"
) %>%
mutate(
blockgroup =
paste0(state,county,tract,block_group)
) %>%
select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>%
select(-c(contains("EA"),contains("MA"),contains("M"))) %>%
gather(
key = "variable",
value = "estimate",
- blockgroup
) %>%
left_join(acs_vars, by = c("variable" = "name")) %>%
mutate(
tier = substr(label,lapply(label, function(x) max(unlist(gregexpr('!!',x)))+2),nchar(label))
) %>%
filter(tier %in% c('Speak English "not well"',
'Speak English "not at all"',
'Total', 'Speak Spanish',
'Speak Asian and Pacific Island languages')) %>%
group_by(blockgroup, tier) %>%
summarise(
estimate1 = sum(estimate)
) %>%
spread(
key = "tier",
value = "estimate1"
) %>%
mutate(
`% speaking english < well` = (`Speak English "not well"` + `Speak English "not at all"`) / Total * 100,
`% speaking english > well` = (100 - `% speaking english < well`),
`% speaking spanish` = (`Speak Spanish`/ Total) * 100,
`% not speaking spanish` = (100 - `% speaking spanish`),
`% speaking api` = (`Speak Asian and Pacific Island languages` / Total) * 100
) %>%
left_join(sj_median_income_by_block %>% dplyr::select(-Median_Income)) %>%
filter(!is.na(device_count)) %>%
mutate(log_perc = log(`% speaking english < well`))
# loading in age data - specifically looking at percentage 65+ and percentage <30
sj_age_by_block <- getCensus(
name = "acs/acs5",
vintage = 2018,
region = "block group:*",
regionin = "state:06+county:085",
vars = "group(B01001)"
) %>%
mutate(
blockgroup =
paste0(state,county,tract,block_group)
) %>%
select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>%
select(-c(contains("EA"),contains("MA"),contains("M"))) %>%
gather(
key = "variable",
value = "estimate",
- blockgroup
) %>%
mutate(
label = acs_vars$label[match(variable,acs_vars$name)]
) %>%
select(-variable) %>%
separate(
label,
into = c(NA,NA,"sex","age"),
sep = "!!"
) %>% filter(!is.na(age)) %>%
mutate(elderly = ifelse(age %in% c("65 and 66 years", "67 to 69 years", "70 to 74 years", "75 to 79 years", "80 to 84 years", "85 years and over"), estimate, NA), `less than 30` = ifelse(age %in% c("Under 5 years", "5 to 9 years", "10 to 14 years", "15 to 17 years", "18 and 19 years", "20 years", "21 years", "22 to 24 years", "25 to 29 years"), estimate, NA)) %>%
group_by(blockgroup) %>%
summarize(elderly = sum(elderly, na.rm = T), `less than 30` = sum(`less than 30`, na.rm = T), total = sum(estimate, na.rm = T)) %>%
mutate(`percent elderly` = elderly*100 / total, `percent less than 30` = `less than 30`*100 / total, `percent nonelderly` = (100 - `percent elderly`)) %>%
left_join(sj_median_income_by_block %>% dplyr::select(-Median_Income)) %>%
filter(!is.na(device_count))
# get data on vehicles available as vehicles allocation
sj_vehicles_by_block <- getCensus(
name = "acs/acs5",
vintage = 2018,
region = "block group:*",
regionin = "state:06+county:085",
vars = "group(B992512)"
) %>%
mutate(
blockgroup =
paste0(state,county,tract,block_group)
) %>%
select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>%
select(-c(contains("EA"),contains("MA"),contains("M"))) %>%
dplyr::select(B992512_001E, blockgroup) %>%
rename(total_vehicles = B992512_001E, blockgroup = blockgroup) %>%
left_join(sj_age_by_block %>% dplyr::select_if(!names(.) %in% c("elderly", "percent elderly", "less than 30", "percent less than 30"))) %>%
mutate(`vehicles per capita` = total_vehicles / total) %>%
filter(!is.na(device_count))
# also get data on vehicles available as households without a vehicle
sj_no_vehicles_by_block <- getCensus(
name = "acs/acs5",
vintage = 2018,
region = "block group:*",
regionin = "state:06+county:085",
vars = "group(B25044)"
) %>%
mutate(
blockgroup =
paste0(state,county,tract,block_group)
) %>%
select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>%
select(-c(contains("EA"),contains("MA"),contains("M"))) %>%
gather(key = "variable", value = "estimate", -blockgroup) %>%
mutate(label = acs_vars$label[match(variable,acs_vars$name)]) %>%
select(-variable) %>%
separate(label, into = c(NA, NA, NA,"vehicles"), sep = "!!") %>%
filter(!is.na(vehicles)) %>%
group_by(blockgroup, vehicles) %>%
summarize(grouped_vehicles = sum(estimate)) %>%
spread(key = vehicles, value = grouped_vehicles) %>%
mutate(total_nums = `1 vehicle available` + `2 vehicles available` + `3 vehicles available` + `4 vehicles available` + `5 or more vehicles available` + `No vehicle available`, `percent no vehicles` = `No vehicle available`*100 / total_nums, `percent with vehicles` = (100-`percent no vehicles`)) %>%
left_join(sj_age_by_block %>% dplyr::select_if(!names(.) %in% c("elderly", "percent elderly", "less than 30", "percent less than 30"))) %>%
filter(!is.na(device_count))
# get data on occupants per room
sj_occupants_per_room_by_block <- getCensus(
name = "acs/acs5",
vintage = 2018,
region = "block group:*",
regionin = "state:06+county:085",
vars = "group(B25014)"
) %>%
mutate(
blockgroup =
paste0(state,county,tract,block_group)
) %>%
select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>%
select(-c(contains("EA"),contains("MA"),contains("M"))) %>%
gather(key = "variable", value = "estimate", -blockgroup) %>%
mutate(label = acs_vars$label[match(variable,acs_vars$name)]) %>%
select(-variable) %>%
separate(label, into = c(NA, NA, NA,"occupants per room"), sep = "!!") %>%
filter(!is.na(`occupants per room`)) %>%
group_by(blockgroup, `occupants per room`) %>%
summarize(estimate_tot = sum(estimate)) %>%
spread(key = `occupants per room`, value = estimate_tot) %>%
mutate(total_nums = `0.50 or less occupants per room` + `0.51 to 1.00 occupants per room` + `1.01 to 1.50 occupants per room` + `1.51 to 2.00 occupants per room` + `2.01 or more occupants per room`, `percent 1 or more` = (`1.01 to 1.50 occupants per room` + `1.51 to 2.00 occupants per room` + `2.01 or more occupants per room`) * 100/ total_nums, `percent less than 1` = (100-`percent 1 or more`)) %>%
left_join(sj_age_by_block %>% dplyr::select_if(!names(.) %in% c("elderly", "percent elderly", "less than 30", "percent less than 30"))) %>%
filter(!is.na(device_count))
In the plots below, I show the selected variables against percent of devices completely at home since the shelter-in-place order started, as well as against percent of devices pre-shelter-in-place for comparison.
Age:
# age
sj_age_by_block %>%
ggplot(aes(
x = `percent less than 30`,
y = `% not completely at home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of residents younger than 30",
y = "Percent devices leaving home on weekdays since shelter-in-place",
title = "San Jose: Social Distancing and Young Age Groups"
)
young_model <- lm(sj_age_by_block$`% not completely at home` ~ sj_age_by_block$`percent less than 30`)
summary(young_model)
##
## Call:
## lm(formula = sj_age_by_block$`% not completely at home` ~ sj_age_by_block$`percent less than 30`)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.505 -4.978 -0.362 4.274 39.808
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 44.49223 1.51348 29.397 < 2e-16 ***
## sj_age_by_block$`percent less than 30` 0.17973 0.03836 4.685 3.51e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.299 on 567 degrees of freedom
## Multiple R-squared: 0.03726, Adjusted R-squared: 0.03557
## F-statistic: 21.95 on 1 and 567 DF, p-value: 3.513e-06
sj_age_by_block %>% filter(`percent elderly` < 50) %>% # get rid of extreme outliers
ggplot(aes(
x = `percent elderly`,
y = `% not completely at home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of residents 65 and older",
y = "Percent devices leaving home on weekdays since shelter-in-place",
title = "San Jose: Social Distancing and Elderly Population"
)
elderly_model <- lm(`% not completely at home` ~ `percent elderly`, sj_age_by_block %>% filter(`percent elderly` < 50))
summary(elderly_model)
##
## Call:
## lm(formula = `% not completely at home` ~ `percent elderly`,
## data = sj_age_by_block %>% filter(`percent elderly` < 50))
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.542 -5.120 -0.472 4.248 36.658
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 53.65024 0.78297 68.52 <2e-16 ***
## `percent elderly` -0.17733 0.05406 -3.28 0.0011 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.361 on 564 degrees of freedom
## Multiple R-squared: 0.01872, Adjusted R-squared: 0.01698
## F-statistic: 10.76 on 1 and 564 DF, p-value: 0.001102
# compare this to pre-shelter-in-place behavior
sj_age_by_block %>%
ggplot(aes(
x = `percent less than 30`,
y = `% not completely at home pre shelter`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of residents younger than 30",
y = "Percent devices leaving home pre-shelter-in-place",
title = "San Jose: Staying at Home and Young Age Groups Pre Shelter-in-Place"
)
young_model2 <- lm(sj_age_by_block$`% not completely at home pre shelter` ~ sj_age_by_block$`percent less than 30`)
summary(young_model2)
##
## Call:
## lm(formula = sj_age_by_block$`% not completely at home pre shelter` ~
## sj_age_by_block$`percent less than 30`)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.1939 -2.8160 -0.1557 2.9950 16.7071
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 81.87032 0.82253 99.53 < 2e-16 ***
## sj_age_by_block$`percent less than 30` -0.11072 0.02085 -5.31 1.57e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.51 on 567 degrees of freedom
## Multiple R-squared: 0.04738, Adjusted R-squared: 0.0457
## F-statistic: 28.2 on 1 and 567 DF, p-value: 1.573e-07
sj_age_by_block %>% filter(`percent elderly` < 50) %>% # get rid of extreme outliers
ggplot(aes(
x = `percent elderly`,
y = `% not completely at home pre shelter`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of residents 65 and older",
y = "Percent devices leaving home on weekdays pre-shelter-in-place",
title = "San Jose: Staying at Home and Elderly Population Pre Shelter-in-Place"
)
elderly_model2 <- lm(`% not completely at home pre shelter` ~ `percent elderly`, sj_age_by_block %>% filter(`percent elderly` < 50))
summary(elderly_model2)
##
## Call:
## lm(formula = `% not completely at home pre shelter` ~ `percent elderly`,
## data = sj_age_by_block %>% filter(`percent elderly` < 50))
##
## Residuals:
## Min 1Q Median 3Q Max
## -27.236 -2.830 -0.158 3.145 14.296
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 75.9045 0.4257 178.295 < 2e-16 ***
## `percent elderly` 0.1329 0.0294 4.522 7.47e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.546 on 564 degrees of freedom
## Multiple R-squared: 0.03499, Adjusted R-squared: 0.03328
## F-statistic: 20.45 on 1 and 564 DF, p-value: 7.466e-06
Income:
# income - less than $75000
sj_ami_by_block %>%
ggplot(aes(
x = `% over 75,000`,
y = `% not completely at home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of housholds with incomes over $75,000 (50% AMI) annually",
y = "Percent devices leaving home on weekdays since shelter-in-place",
title = "San Jose: Social Distancing and Households Above 50% AMI"
)
income_75_model <- lm(`% not completely at home` ~ `% over 75,000`, sj_ami_by_block)
summary(income_75_model)
##
## Call:
## lm(formula = `% not completely at home` ~ `% over 75,000`, data = sj_ami_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.389 -4.795 -0.606 4.185 34.925
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 64.35928 1.11518 57.71 <2e-16 ***
## `% over 75,000` -0.20909 0.01719 -12.16 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.444 on 566 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.2072, Adjusted R-squared: 0.2058
## F-statistic: 148 on 1 and 566 DF, p-value: < 2.2e-16
# income - less than $100000
sj_ami_by_block %>%
ggplot(aes(
x = `% over 100,000`,
y = `% not completely at home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of housholds with incomes over $100,000 (80% AMI) annually",
y = "Percent devices leaving home on weekdays since shelter-in-place",
title = "San Jose: Social Distancing and Households Below 80% AMI"
)
income_100_model <- lm(`% not completely at home` ~ `% over 100,000`, sj_ami_by_block)
summary(income_100_model)
##
## Call:
## lm(formula = `% not completely at home` ~ `% over 100,000`, data = sj_ami_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.762 -4.639 -0.500 4.176 33.309
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 61.78423 0.87244 70.82 <2e-16 ***
## `% over 100,000` -0.20475 0.01599 -12.80 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.362 on 566 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.2246, Adjusted R-squared: 0.2232
## F-statistic: 163.9 on 1 and 566 DF, p-value: < 2.2e-16
# income - less than $125000
sj_ami_by_block %>%
ggplot(aes(
x = `% over 125,000`,
y = `% not completely at home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of housholds with incomes over $125,000 annually",
y = "Percent devices leaving home on weekdays since shelter-in-place",
title = "San Jose: Social Distancing and Households Below $125,000"
)
income_125_model <- lm(`% not completely at home` ~ `% over 125,000`, sj_ami_by_block)
summary(income_125_model)
##
## Call:
## lm(formula = `% not completely at home` ~ `% over 125,000`, data = sj_ami_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.072 -4.669 -0.845 4.018 32.374
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 60.00338 0.73524 81.61 <2e-16 ***
## `% over 125,000` -0.21065 0.01623 -12.98 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.339 on 566 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.2294, Adjusted R-squared: 0.228
## F-statistic: 168.5 on 1 and 566 DF, p-value: < 2.2e-16
# compare to pre shelter in place
sj_ami_by_block %>%
ggplot(aes(
x = `% over 75,000`,
y = `% not completely at home pre shelter`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of housholds with incomes over $75,000 (50% AMI) annually",
y = "Percent devices leaving home on weekdays pre-shelter-in-place",
title = "San Jose: Staying at Home and Households Above 50% AMI Pre Shelter-in-Place"
)
income_75_model2 <- lm(`% not completely at home pre shelter` ~ `% over 75,000`, sj_ami_by_block)
summary(income_75_model2)
##
## Call:
## lm(formula = `% not completely at home pre shelter` ~ `% over 75,000`,
## data = sj_ami_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.4357 -2.7003 -0.1437 2.7764 16.6680
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 72.61447 0.65712 110.504 < 2e-16 ***
## `% over 75,000` 0.08029 0.01013 7.926 1.21e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.386 on 566 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.09991, Adjusted R-squared: 0.09832
## F-statistic: 62.83 on 1 and 566 DF, p-value: 1.206e-14
# income - less than $100000
sj_ami_by_block %>%
ggplot(aes(
x = `% over 100,000`,
y = `% not completely at home pre shelter`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of housholds with incomes over $100,000 (80% AMI) annually",
y = "Percent devices leaving home on weekdays pre-shelter-in-place",
title = "San Jose: Staying Home and Households Below 80% AMI Pre Shelter-in-Place"
)
income_100_model2 <- lm(`% not completely at home pre shelter` ~ `% over 100,000`, sj_ami_by_block)
summary(income_100_model2)
##
## Call:
## lm(formula = `% not completely at home pre shelter` ~ `% over 100,000`,
## data = sj_ami_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.5034 -2.6406 0.0803 2.5599 16.9387
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 73.26132 0.51177 143.152 <2e-16 ***
## `% over 100,000` 0.08532 0.00938 9.096 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.319 on 566 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.1275, Adjusted R-squared: 0.126
## F-statistic: 82.73 on 1 and 566 DF, p-value: < 2.2e-16
# over 125000
sj_ami_by_block %>%
ggplot(aes(
x = `% over 125,000`,
y = `% not completely at home pre shelter`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of housholds with incomes over $125,000 annually",
y = "Percent devices leaving home on weekdays pre-shelter-in-place",
title = "San Jose: Social Distancing and Households Below $125,000 Pre Shelter-in-Place"
)
income_125_model2 <- lm(`% not completely at home pre shelter` ~ `% over 125,000`, sj_ami_by_block)
summary(income_125_model2)
##
## Call:
## lm(formula = `% not completely at home pre shelter` ~ `% over 125,000`,
## data = sj_ami_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.353 -2.556 0.022 2.522 16.560
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 73.640242 0.425069 173.2 <2e-16 ***
## `% over 125,000` 0.096607 0.009382 10.3 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.243 on 566 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.1578, Adjusted R-squared: 0.1563
## F-statistic: 106 on 1 and 566 DF, p-value: < 2.2e-16
Language:
# language
sj_lang_by_block %>%
ggplot(aes(
x = `% speaking english > well`,
y = `% not completely at home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of individuals speaking English well",
y = "Percent devices leaving home on weekdays since shelter-in-place",
title = "San Jose: Social Distancing and English Language Ability"
)
english_ability_model <- lm(`% not completely at home` ~ `% speaking english > well`, sj_lang_by_block)
summary(english_ability_model)
##
## Call:
## lm(formula = `% not completely at home` ~ `% speaking english > well`,
## data = sj_lang_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.467 -5.070 -0.542 3.887 39.230
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 67.28491 3.36107 20.019 < 2e-16 ***
## `% speaking english > well` -0.17915 0.03769 -4.754 2.53e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.294 on 567 degrees of freedom
## Multiple R-squared: 0.03833, Adjusted R-squared: 0.03663
## F-statistic: 22.6 on 1 and 567 DF, p-value: 2.534e-06
sj_lang_by_block %>%
ggplot(aes(
x = `% not speaking spanish`,
y = `% not completely at home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of individuals not speaking Spanish",
y = "Percent devices leaving home on weekdays since shelter-in-place",
title = "San Jose: Social Distancing and Spanish Language Ability"
)
spanish_speaking_model <- lm(`% not completely at home` ~ `% not speaking spanish`, sj_lang_by_block)
summary(spanish_speaking_model)
##
## Call:
## lm(formula = `% not completely at home` ~ `% not speaking spanish`,
## data = sj_lang_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.434 -4.585 -0.796 3.568 38.341
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 63.61990 1.32137 48.147 <2e-16 ***
## `% not speaking spanish` -0.15727 0.01646 -9.555 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.849 on 567 degrees of freedom
## Multiple R-squared: 0.1387, Adjusted R-squared: 0.1372
## F-statistic: 91.29 on 1 and 567 DF, p-value: < 2.2e-16
# compare to pre shelter in place
sj_lang_by_block %>%
ggplot(aes(
x = `% speaking english > well`,
y = `% not completely at home pre shelter`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of individuals speaking English well",
y = "Percent devices leaving home on weekdays pre-shelter-in-place",
title = "San Jose: Staying at Home and English Language Ability Pre Shelter-in-Place"
)
english_ability_model2 <- lm(`% not completely at home pre shelter` ~ `% speaking english > well`, sj_lang_by_block)
summary(english_ability_model2)
##
## Call:
## lm(formula = `% not completely at home pre shelter` ~ `% speaking english > well`,
## data = sj_lang_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.9364 -2.4342 0.0388 3.0316 12.3011
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 60.84131 1.73337 35.100 <2e-16 ***
## `% speaking english > well` 0.18913 0.01943 9.732 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.277 on 567 degrees of freedom
## Multiple R-squared: 0.1431, Adjusted R-squared: 0.1416
## F-statistic: 94.7 on 1 and 567 DF, p-value: < 2.2e-16
sj_lang_by_block %>%
ggplot(aes(
x = `% not speaking spanish`,
y = `% not completely at home pre shelter`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of individuals not speaking Spanish",
y = "Percent devices leaving home on weekdays pre shelter-in-place",
title = "San Jose: Staying at Home and Spanish Language Ability Pre Shelter-in-Place"
)
spanish_speaking_model2 <- lm(`% not completely at home pre shelter` ~ `% not speaking spanish`, sj_lang_by_block)
summary(spanish_speaking_model2)
##
## Call:
## lm(formula = `% not completely at home pre shelter` ~ `% not speaking spanish`,
## data = sj_lang_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.793 -2.540 -0.002 2.680 11.988
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 71.228200 0.726831 97.998 <2e-16 ***
## `% not speaking spanish` 0.082206 0.009054 9.079 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.318 on 567 degrees of freedom
## Multiple R-squared: 0.1269, Adjusted R-squared: 0.1254
## F-statistic: 82.43 on 1 and 567 DF, p-value: < 2.2e-16
Occupants per room:
# occupants per room
sj_occupants_per_room_by_block %>%
ggplot(aes(
x = `percent less than 1`,
y = `% not completely at home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of households with 1 or fewer occupant per room",
y = "Percent devices leaving home on weekdays since shelter-in-place",
title = "San Jose: Social Distancing and Room Occupancy"
)
occupants_model <- lm(`% not completely at home` ~ `percent less than 1`, sj_occupants_per_room_by_block)
summary(occupants_model)
##
## Call:
## lm(formula = `% not completely at home` ~ `percent less than 1`,
## data = sj_occupants_per_room_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.657 -4.937 -0.288 3.901 35.043
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 70.49553 2.95264 23.875 < 2e-16 ***
## `percent less than 1` -0.21239 0.03252 -6.532 1.45e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.062 on 566 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.07009, Adjusted R-squared: 0.06845
## F-statistic: 42.66 on 1 and 566 DF, p-value: 1.451e-10
# compare to pre shelter in place
sj_occupants_per_room_by_block %>%
ggplot(aes(
x = `percent less than 1`,
y = `% not completely at home pre shelter`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of households with 1 or fewer occupant per room",
y = "Percent devices leaving home on weekdays pre shelter-in-place",
title = "San Jose: Staying at Home and Room Occupancy Pre Shelter-in-Place"
)
occupants_model2 <- lm(`% not completely at home pre shelter` ~ `percent less than 1`, sj_occupants_per_room_by_block)
summary(occupants_model2)
##
## Call:
## lm(formula = `% not completely at home pre shelter` ~ `percent less than 1`,
## data = sj_occupants_per_room_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.3246 -2.6506 -0.2808 2.7536 17.0509
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 62.88485 1.57437 39.943 <2e-16 ***
## `percent less than 1` 0.16329 0.01734 9.418 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.299 on 566 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.1355, Adjusted R-squared: 0.134
## F-statistic: 88.7 on 1 and 566 DF, p-value: < 2.2e-16
Vehicle ownership:
# vehicles
sj_vehicles_by_block %>%
ggplot(aes(
x = `vehicles per capita`,
y = `% not completely at home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Number of vehicles per capita",
y = "Percent devices leaving home on weekdays since shelter-in-place",
title = "San Jose: Social Distancing and Vehicles Per Capita"
)
# vehicles - percent with no vehicles
sj_no_vehicles_by_block %>%
ggplot(aes(
x = `percent with vehicles`,
y = `% not completely at home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of housholds with vehicles available",
y = "Percent devices leaving home on weekdays since shelter-in-place",
title = "San Jose: Social Distancing and Vehicle Availability"
)
vehicles_model <- lm(`% not completely at home` ~ `percent with vehicles`, sj_no_vehicles_by_block)
summary(vehicles_model)
##
## Call:
## lm(formula = `% not completely at home` ~ `percent with vehicles`,
## data = sj_no_vehicles_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.500 -5.226 -0.406 4.579 38.500
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 75.71497 5.14420 14.72 < 2e-16 ***
## `percent with vehicles` -0.25615 0.05393 -4.75 2.59e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.199 on 566 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.03833, Adjusted R-squared: 0.03663
## F-statistic: 22.56 on 1 and 566 DF, p-value: 2.587e-06
# compare to pre shelter in place
sj_no_vehicles_by_block %>%
ggplot(aes(
x = `percent with vehicles`,
y = `% not completely at home pre shelter`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of housholds with vehicles available",
y = "Percent devices leaving home on weekdays pre shelter-in-place",
title = "San Jose: Social Distancing and Vehicle Availability Pre Shelter-in-Place"
)
vehicles_model2 <- lm(`% not completely at home pre shelter` ~ `percent with vehicles`, sj_no_vehicles_by_block)
summary(vehicles_model2)
##
## Call:
## lm(formula = `% not completely at home pre shelter` ~ `percent with vehicles`,
## data = sj_no_vehicles_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.5618 -2.9606 -0.0694 3.0006 12.6053
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 63.25942 2.83717 22.297 < 2e-16 ***
## `percent with vehicles` 0.15084 0.02975 5.071 5.37e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.522 on 566 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.04346, Adjusted R-squared: 0.04177
## F-statistic: 25.72 on 1 and 566 DF, p-value: 5.37e-07
Multiple regression analysis with income, age, language, and occupants per room
# multiple regression
modeltest <- lm(sj_ami_by_block$`% not completely at home` ~ sj_ami_by_block$`% over 125,000` + sj_age_by_block$`percent less than 30` + sj_lang_by_block$`% speaking english > well` + sj_occupants_per_room_by_block$`percent less than 1`)
summary(modeltest)
##
## Call:
## lm(formula = sj_ami_by_block$`% not completely at home` ~ sj_ami_by_block$`% over 125,000` +
## sj_age_by_block$`percent less than 30` + sj_lang_by_block$`% speaking english > well` +
## sj_occupants_per_room_by_block$`percent less than 1`)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.490 -4.658 -0.797 3.973 32.270
##
## Coefficients:
## Estimate Std. Error
## (Intercept) 50.72548 4.77236
## sj_ami_by_block$`% over 125,000` -0.23965 0.02152
## sj_age_by_block$`percent less than 30` 0.01581 0.04209
## sj_lang_by_block$`% speaking english > well` 0.13432 0.04611
## sj_occupants_per_room_by_block$`percent less than 1` -0.02274 0.04614
## t value Pr(>|t|)
## (Intercept) 10.629 < 2e-16 ***
## sj_ami_by_block$`% over 125,000` -11.135 < 2e-16 ***
## sj_age_by_block$`percent less than 30` 0.376 0.70739
## sj_lang_by_block$`% speaking english > well` 2.913 0.00372 **
## sj_occupants_per_room_by_block$`percent less than 1` -0.493 0.62234
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.297 on 563 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.2423, Adjusted R-squared: 0.2369
## F-statistic: 45 on 4 and 563 DF, p-value: < 2.2e-16
I also consider education and internet access, based on previous research. Education:
sj_education_by_block <- getCensus(
name = "acs/acs5",
vintage = 2018,
region = "block group:*",
regionin = "state:06+county:085",
vars = "group(B15003)"
) %>%
mutate(
blockgroup =
paste0(state,county,tract,block_group)
) %>%
select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>%
select(-c(contains("EA"),contains("MA"),contains("M"))) %>%
gather(key = "variable", value = "estimate", -blockgroup) %>%
mutate(label = acs_vars$label[match(variable,acs_vars$name)]) %>%
select(-variable) %>%
separate(label, into = c(NA, NA, "education level"), sep = "!!") %>%
mutate(`education level` = replace_na(`education level`, "total_educ")) %>% # if the education level field is NA, this corresponded to the total number in that blockgroup
spread(key = `education level`, value = estimate) %>%
mutate(`percent associates or higher` = (`Associate's degree` + `Bachelor's degree` + `Doctorate degree` + `Master's degree`)*100/total_educ, `percent less than associates` = 100-`percent associates or higher`) %>%
left_join(sj_age_by_block %>% dplyr::select_if(!names(.) %in% c("elderly", "percent elderly", "less than 30", "percent less than 30"))) %>%
filter(!is.na(device_count))
# plotting
sj_education_by_block %>%
ggplot(aes(
x = `percent associates or higher`,
y = `% not completely at home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of people with an degree at Associate's level or higher",
y = "Percent devices leaving home on weekdays since shelter-in-place",
title = "San Jose: Social Distancing and Education"
)
educ_model <- lm(`% not completely at home` ~ `percent associates or higher`, sj_education_by_block)
summary(educ_model)
##
## Call:
## lm(formula = `% not completely at home` ~ `percent associates or higher`,
## data = sj_education_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.619 -4.747 -0.950 3.300 43.002
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 60.4003 0.8408 71.84 <2e-16 ***
## `percent associates or higher` -0.1910 0.0165 -11.58 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.606 on 567 degrees of freedom
## Multiple R-squared: 0.1912, Adjusted R-squared: 0.1898
## F-statistic: 134.1 on 1 and 567 DF, p-value: < 2.2e-16
# compare to pre shelter in place
sj_education_by_block %>%
ggplot(aes(
x = `percent associates or higher`,
y = `% not completely at home pre shelter`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of people with an degree at Associate's level or higher",
y = "Percent devices leaving home on weekdays pre-shelter-in-place",
title = "San Jose: Social Distancing and Education Pre Shelter-in-Place"
)
educ_model2 <- lm(`% not completely at home pre shelter` ~ `percent associates or higher`, sj_education_by_block)
summary(educ_model2)
##
## Call:
## lm(formula = `% not completely at home pre shelter` ~ `percent associates or higher`,
## data = sj_education_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -30.2224 -2.5458 0.0672 2.7859 13.9180
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 73.547853 0.476189 154.45 <2e-16 ***
## `percent associates or higher` 0.086343 0.009344 9.24 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.308 on 567 degrees of freedom
## Multiple R-squared: 0.1309, Adjusted R-squared: 0.1293
## F-statistic: 85.39 on 1 and 567 DF, p-value: < 2.2e-16
Internet:
sj_internet_by_block <- getCensus(
name = "acs/acs5",
vintage = 2018,
region = "block group:*",
regionin = "state:06+county:085",
vars = "group(B28002)"
) %>%
mutate(
blockgroup =
paste0(state,county,tract,block_group)
) %>%
select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>%
select(-c(contains("EA"),contains("MA"),contains("M"))) %>%
gather(key = "variable", value = "estimate", -blockgroup) %>%
mutate(label = acs_vars$label[match(variable,acs_vars$name)]) %>%
select(-variable) %>%
separate(label, into = c(NA, NA, "subscription", "type", "additional"), sep = "!!") %>%
filter(is.na(subscription) | (type == "Broadband such as cable, fiber optic or DSL") & is.na(additional)) %>%
mutate(type = replace_na(type, "total_num")) %>%
dplyr::select(blockgroup, type, estimate) %>%
spread(key = type, value = estimate) %>%
mutate(`percent high speed` = `Broadband such as cable, fiber optic or DSL`*100/total_num, `percent no high speed` = 100-`percent high speed`) %>%
left_join(sj_age_by_block %>% dplyr::select_if(!names(.) %in% c("elderly", "percent elderly", "less than 30", "percent less than 30"))) %>%
filter(!is.na(device_count))
# plotting
sj_internet_by_block %>%
ggplot(aes(
x = `percent high speed`,
y = `% not completely at home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of households with broadband such as cable, fiber optic or DSL",
y = "Percent devices leaving home on weekdays since shelter-in-place",
title = "San Jose: Social Distancing and High Speed Internet"
)
internet_model <- lm(`% not completely at home` ~ `percent high speed`, sj_internet_by_block)
summary(internet_model)
##
## Call:
## lm(formula = `% not completely at home` ~ `percent high speed`,
## data = sj_internet_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.884 -4.676 -0.549 3.996 38.086
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 72.72612 2.22683 32.659 <2e-16 ***
## `percent high speed` -0.26512 0.02731 -9.709 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.741 on 566 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.1428, Adjusted R-squared: 0.1413
## F-statistic: 94.26 on 1 and 566 DF, p-value: < 2.2e-16
# compare to pre-shelter-in-place behavior
sj_internet_by_block %>%
ggplot(aes(
x = `percent high speed`,
y = `% not completely at home pre shelter`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of households with broadband such as cable, fiber optic or DSL",
y = "Percent devices leaving home on weekdays pre-shelter-in-place",
title = "San Jose: Social Distancing and High Speed Internet Pre Shelter-in-Place"
)
internet_model2 <- lm(`% not completely at home pre shelter` ~ `percent high speed`, sj_internet_by_block)
summary(internet_model2)
##
## Call:
## lm(formula = `% not completely at home pre shelter` ~ `percent high speed`,
## data = sj_internet_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.5915 -2.8440 -0.1157 2.8288 16.8750
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 69.1067 1.2800 53.990 < 2e-16 ***
## `percent high speed` 0.1055 0.0157 6.719 4.48e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.449 on 566 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.07386, Adjusted R-squared: 0.07223
## F-statistic: 45.14 on 1 and 566 DF, p-value: 4.477e-11
Do another multiple regression analysis, this time with education and income
educ_income_model <- lm(sj_ami_by_block$`% not completely at home` ~ sj_ami_by_block$`% over 125,000` + sj_education_by_block$`percent associates or higher`)
summary(educ_income_model)
##
## Call:
## lm(formula = sj_ami_by_block$`% not completely at home` ~ sj_ami_by_block$`% over 125,000` +
## sj_education_by_block$`percent associates or higher`)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.731 -4.702 -1.052 3.700 32.133
##
## Coefficients:
## Estimate Std. Error
## (Intercept) 61.47695 0.82679
## sj_ami_by_block$`% over 125,000` -0.15122 0.02258
## sj_education_by_block$`percent associates or higher` -0.08302 0.02219
## t value Pr(>|t|)
## (Intercept) 74.356 < 2e-16 ***
## sj_ami_by_block$`% over 125,000` -6.697 5.15e-11 ***
## sj_education_by_block$`percent associates or higher` -3.741 0.000202 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.257 on 565 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.248, Adjusted R-squared: 0.2454
## F-statistic: 93.18 on 2 and 565 DF, p-value: < 2.2e-16
Try with internet and income
educ_income_model <- lm(sj_ami_by_block$`% not completely at home` ~ sj_ami_by_block$`% over 125,000` + sj_internet_by_block$`percent high speed`)
summary(educ_income_model)
##
## Call:
## lm(formula = sj_ami_by_block$`% not completely at home` ~ sj_ami_by_block$`% over 125,000` +
## sj_internet_by_block$`percent high speed`)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.166 -4.531 -0.761 4.120 32.120
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 65.16907 2.28887 28.472 < 2e-16
## sj_ami_by_block$`% over 125,000` -0.17772 0.02127 -8.356 5.03e-16
## sj_internet_by_block$`percent high speed` -0.08082 0.03393 -2.382 0.0175
##
## (Intercept) ***
## sj_ami_by_block$`% over 125,000` ***
## sj_internet_by_block$`percent high speed` *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.309 on 565 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.2371, Adjusted R-squared: 0.2344
## F-statistic: 87.78 on 2 and 565 DF, p-value: < 2.2e-16
Now I consider looking at correlations with the reduction in percent of devices leaving the home since shelter-in-place started relative to the pre-shelter-in-place levels.
# collect the demographic variables
sj_dem_distancing <- sj_internet_by_block %>%
dplyr::select(`percent high speed`, `% not completely at home`, `% Completely at Home`, blockgroup) %>%
left_join(sj_education_by_block %>% dplyr::select(blockgroup, `percent associates or higher`)) %>%
left_join(sj_ami_by_block %>% dplyr::select(blockgroup, `% over 125,000`)) %>%
left_join(sj_ami_by_block %>% dplyr::select(blockgroup, `% over 100,000`)) %>%
left_join(sj_ami_by_block %>% dplyr::select(blockgroup, `% over 75,000`)) %>%
left_join(sj_age_by_block %>% dplyr::select(blockgroup, `percent less than 30`)) %>%
left_join(sj_age_by_block %>% dplyr::select(blockgroup, `percent elderly`)) %>%
left_join(sj_lang_by_block %>% dplyr::select(blockgroup, `% not speaking spanish`)) %>%
left_join(sj_lang_by_block %>% dplyr::select(blockgroup, `% speaking english > well`)) %>%
left_join(sj_no_vehicles_by_block %>% dplyr::select(blockgroup, `percent with vehicles`)) %>%
left_join(sj_occupants_per_room_by_block %>% dplyr::select(blockgroup, `percent less than 1`))
sj_dem_distancing_pre_post <- sj_dem_distancing %>%
left_join(sj_internet_by_block %>% dplyr::select(`% not completely at home pre shelter`, `% Completely at Home Pre Shelter`, blockgroup)) %>%
mutate(`% increase in staying completely home` = `% Completely at Home` - `% Completely at Home Pre Shelter`)
saveRDS(sj_dem_distancing_pre_post, "/Users/simonespeizer/Documents/2020 Spring Quarter/CEE 218Z/covid19/Simone_Speizer/sj_socialdistancing_demdata_prepostdifs_manyvars.rds")
# sj_dem_distancing_pre_post <- readRDS("/Users/simonespeizer/Documents/2020 Spring Quarter/CEE 218Z/covid19/Simone_Speizer/sj_socialdistancing_demdata_prepostdifs_manyvars.rds")
Age:
# age
sj_dem_distancing_pre_post %>%
ggplot(aes(
x = `percent less than 30`,
y = `% increase in staying completely home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of residents younger than 30",
y = "Percent increase in staying completely at home since shelter-in-place",
title = "San Jose: Social Distancing and Young Age Groups"
)
young_model_dif <- lm(sj_dem_distancing_pre_post$`% increase in staying completely home` ~ sj_dem_distancing_pre_post$`percent less than 30`)
summary(young_model_dif)
##
## Call:
## lm(formula = sj_dem_distancing_pre_post$`% increase in staying completely home` ~
## sj_dem_distancing_pre_post$`percent less than 30`)
##
## Residuals:
## Min 1Q Median 3Q Max
## -38.099 -5.332 -0.019 5.441 30.340
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 37.37809 1.73272 21.572
## sj_dem_distancing_pre_post$`percent less than 30` -0.29045 0.04392 -6.613
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## sj_dem_distancing_pre_post$`percent less than 30` 8.72e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.501 on 567 degrees of freedom
## Multiple R-squared: 0.0716, Adjusted R-squared: 0.06997
## F-statistic: 43.73 on 1 and 567 DF, p-value: 8.723e-11
sj_dem_distancing_pre_post %>% filter(`percent elderly` < 50) %>% # get rid of extreme outliers
ggplot(aes(
x = `percent elderly`,
y = `% increase in staying completely home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of residents 65 and older",
y = "Percent increase in staying completely at home since shelter-in-place",
title = "San Jose: Social Distancing and Elderly Population"
)
elderly_model_dif <- lm(`% increase in staying completely home` ~ `percent elderly`, sj_dem_distancing_pre_post %>% filter(`percent elderly` < 50))
summary(elderly_model_dif)
##
## Call:
## lm(formula = `% increase in staying completely home` ~ `percent elderly`,
## data = sj_dem_distancing_pre_post %>% filter(`percent elderly` <
## 50))
##
## Residuals:
## Min 1Q Median 3Q Max
## -40.371 -5.656 -0.321 6.062 31.123
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.25429 0.90241 24.661 < 2e-16 ***
## `percent elderly` 0.31026 0.06231 4.979 8.49e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.636 on 564 degrees of freedom
## Multiple R-squared: 0.04211, Adjusted R-squared: 0.04041
## F-statistic: 24.79 on 1 and 564 DF, p-value: 8.494e-07
Income:
# income - less than $75000
sj_dem_distancing_pre_post %>%
ggplot(aes(
x = `% over 75,000`,
y = `% increase in staying completely home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of housholds with incomes over $75,000 (50% AMI) annually",
y = "Percent increase in staying completely at home since shelter-in-place",
title = "San Jose: Social Distancing and Households Above 50% AMI"
)
income_75_model_dif <- lm(`% increase in staying completely home` ~ `% over 75,000`, sj_dem_distancing_pre_post)
summary(income_75_model_dif)
##
## Call:
## lm(formula = `% increase in staying completely home` ~ `% over 75,000`,
## data = sj_dem_distancing_pre_post)
##
## Residuals:
## Min 1Q Median 3Q Max
## -36.473 -4.471 0.535 4.900 24.178
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.25518 1.23696 6.674 5.95e-11 ***
## `% over 75,000` 0.28938 0.01907 15.177 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.257 on 566 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.2893, Adjusted R-squared: 0.288
## F-statistic: 230.4 on 1 and 566 DF, p-value: < 2.2e-16
# income - less than $100000
sj_dem_distancing_pre_post %>%
ggplot(aes(
x = `% over 100,000`,
y = `% increase in staying completely home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of housholds with incomes over $100,000 (80% AMI) annually",
y = "Percent increase in staying completely at home since shelter-in-place",
title = "San Jose: Social Distancing and Households Below 80% AMI"
)
income_100_model_dif <- lm(`% increase in staying completely home` ~ `% over 100,000`, sj_dem_distancing_pre_post)
summary(income_100_model_dif)
##
## Call:
## lm(formula = `% increase in staying completely home` ~ `% over 100,000`,
## data = sj_dem_distancing_pre_post)
##
## Residuals:
## Min 1Q Median 3Q Max
## -36.297 -4.436 0.906 5.318 20.897
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.47709 0.95110 12.07 <2e-16 ***
## `% over 100,000` 0.29007 0.01743 16.64 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.026 on 566 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.3285, Adjusted R-squared: 0.3273
## F-statistic: 276.9 on 1 and 566 DF, p-value: < 2.2e-16
# income - less than $125000
sj_dem_distancing_pre_post %>%
ggplot(aes(
x = `% over 125,000`,
y = `% increase in staying completely home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of housholds with incomes over $125,000 annually",
y = "Percent increase in staying completely at home since shelter-in-place",
title = "San Jose: Social Distancing and Households Below $125,000"
)
income_125_model_dif <- lm(`% increase in staying completely home` ~ `% over 125,000`, sj_dem_distancing_pre_post)
summary(income_125_model_dif)
##
## Call:
## lm(formula = `% increase in staying completely home` ~ `% over 125,000`,
## data = sj_dem_distancing_pre_post)
##
## Residuals:
## Min 1Q Median 3Q Max
## -37.150 -4.000 0.622 5.134 21.377
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.63687 0.78759 17.32 <2e-16 ***
## `% over 125,000` 0.30726 0.01738 17.68 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.862 on 566 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.3556, Adjusted R-squared: 0.3545
## F-statistic: 312.4 on 1 and 566 DF, p-value: < 2.2e-16
Language:
# language
sj_dem_distancing_pre_post %>%
ggplot(aes(
x = `% speaking english > well`,
y = `% increase in staying completely home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of individuals speaking English well",
y = "Percent increase in staying completely at home since shelter-in-place",
title = "San Jose: Social Distancing and English Language Ability"
)
english_ability_model_dif <- lm(`% increase in staying completely home` ~ `% speaking english > well`, sj_dem_distancing_pre_post)
summary(english_ability_model_dif)
##
## Call:
## lm(formula = `% increase in staying completely home` ~ `% speaking english > well`,
## data = sj_dem_distancing_pre_post)
##
## Residuals:
## Min 1Q Median 3Q Max
## -36.714 -4.743 0.360 5.067 29.631
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.44360 3.75014 -1.718 0.0863 .
## `% speaking english > well` 0.36828 0.04205 8.759 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.254 on 567 degrees of freedom
## Multiple R-squared: 0.1192, Adjusted R-squared: 0.1176
## F-statistic: 76.72 on 1 and 567 DF, p-value: < 2.2e-16
sj_dem_distancing_pre_post %>%
ggplot(aes(
x = `% not speaking spanish`,
y = `% increase in staying completely home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of individuals not speaking Spanish",
y = "Percent increase in staying completely at home since shelter-in-place",
title = "San Jose: Social Distancing and Spanish Language Ability"
)
spanish_speaking_model_dif <- lm(`% increase in staying completely home` ~ `% not speaking spanish`, sj_dem_distancing_pre_post)
summary(spanish_speaking_model_dif)
##
## Call:
## lm(formula = `% increase in staying completely home` ~ `% not speaking spanish`,
## data = sj_dem_distancing_pre_post)
##
## Residuals:
## Min 1Q Median 3Q Max
## -40.698 -3.803 0.725 5.249 25.544
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.60830 1.45033 5.246 2.2e-07 ***
## `% not speaking spanish` 0.23948 0.01807 13.255 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.615 on 567 degrees of freedom
## Multiple R-squared: 0.2366, Adjusted R-squared: 0.2352
## F-statistic: 175.7 on 1 and 567 DF, p-value: < 2.2e-16
Occupants per room:
# occupants per room
sj_dem_distancing_pre_post %>%
ggplot(aes(
x = `percent less than 1`,
y = `% increase in staying completely home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of households with 1 or fewer occupant per room",
y = "Percent increase in staying completely at home since shelter-in-place",
title = "San Jose: Social Distancing and Room Occupancy"
)
occupants_model_dif <- lm(`% increase in staying completely home` ~ `percent less than 1`, sj_dem_distancing_pre_post)
summary(occupants_model_dif)
##
## Call:
## lm(formula = `% increase in staying completely home` ~ `percent less than 1`,
## data = sj_dem_distancing_pre_post)
##
## Residuals:
## Min 1Q Median 3Q Max
## -39.905 -4.893 0.459 5.144 27.142
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.61068 3.28780 -2.315 0.021 *
## `percent less than 1` 0.37568 0.03621 10.376 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.978 on 566 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.1598, Adjusted R-squared: 0.1583
## F-statistic: 107.7 on 1 and 566 DF, p-value: < 2.2e-16
Vehicle ownership:
# vehicles - percent with no vehicles
sj_dem_distancing_pre_post %>%
ggplot(aes(
x = `percent with vehicles`,
y = `% increase in staying completely home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of housholds with vehicles available",
y = "Percent increase in staying completely at home since shelter-in-place",
title = "San Jose: Social Distancing and Vehicle Availability"
)
vehicles_model_dif <- lm(`% increase in staying completely home` ~ `percent with vehicles`, sj_dem_distancing_pre_post)
summary(vehicles_model_dif)
##
## Call:
## lm(formula = `% increase in staying completely home` ~ `percent with vehicles`,
## data = sj_dem_distancing_pre_post)
##
## Residuals:
## Min 1Q Median 3Q Max
## -41.983 -5.915 0.056 5.696 29.934
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -12.45555 5.92452 -2.102 0.036 *
## `percent with vehicles` 0.40699 0.06211 6.552 1.27e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.443 on 566 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.07051, Adjusted R-squared: 0.06886
## F-statistic: 42.93 on 1 and 566 DF, p-value: 1.274e-10
Education:
sj_dem_distancing_pre_post %>%
ggplot(aes(
x = `percent associates or higher`,
y = `% increase in staying completely home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of people with an degree at Associate's level or higher",
y = "Percent increase in staying completely at home since shelter-in-place",
title = "San Jose: Social Distancing and Education"
)
educ_model_dif <- lm(`% increase in staying completely home` ~ `percent associates or higher`, sj_dem_distancing_pre_post)
summary(educ_model_dif)
##
## Call:
## lm(formula = `% increase in staying completely home` ~ `percent associates or higher`,
## data = sj_dem_distancing_pre_post)
##
## Residuals:
## Min 1Q Median 3Q Max
## -40.284 -3.059 1.107 5.124 22.190
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.14752 0.91413 14.38 <2e-16 ***
## `percent associates or higher` 0.27737 0.01794 15.46 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.27 on 567 degrees of freedom
## Multiple R-squared: 0.2966, Adjusted R-squared: 0.2954
## F-statistic: 239.1 on 1 and 567 DF, p-value: < 2.2e-16
Internet:
sj_dem_distancing_pre_post %>%
ggplot(aes(
x = `percent high speed`,
y = `% increase in staying completely home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of households with broadband such as cable, fiber optic or DSL",
y = "Percent increase in staying completely at home since shelter-in-place",
title = "San Jose: Social Distancing and High Speed Internet"
)
internet_model_dif <- lm(`% increase in staying completely home` ~ `percent high speed`, sj_dem_distancing_pre_post)
summary(internet_model_dif)
##
## Call:
## lm(formula = `% increase in staying completely home` ~ `percent high speed`,
## data = sj_dem_distancing_pre_post)
##
## Residuals:
## Min 1Q Median 3Q Max
## -39.453 -4.843 0.409 5.396 26.676
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.61938 2.51490 -1.439 0.151
## `percent high speed` 0.37057 0.03084 12.016 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.742 on 566 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.2033, Adjusted R-squared: 0.2019
## F-statistic: 144.4 on 1 and 566 DF, p-value: < 2.2e-16
Multiple regression analysis with income, internet, Spanish language ability, and occupants per room.
difs_model <- lm(sj_dem_distancing_pre_post$`% increase in staying completely home` ~ sj_dem_distancing_pre_post$`% over 125,000` + sj_dem_distancing_pre_post$`% not speaking spanish` + sj_dem_distancing_pre_post$`percent less than 1` + sj_dem_distancing_pre_post$`percent high speed`)
summary(difs_model)
##
## Call:
## lm(formula = sj_dem_distancing_pre_post$`% increase in staying completely home` ~
## sj_dem_distancing_pre_post$`% over 125,000` + sj_dem_distancing_pre_post$`% not speaking spanish` +
## sj_dem_distancing_pre_post$`percent less than 1` + sj_dem_distancing_pre_post$`percent high speed`)
##
## Residuals:
## Min 1Q Median 3Q Max
## -37.260 -3.786 0.914 4.997 21.342
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 9.35496 3.74626 2.497
## sj_dem_distancing_pre_post$`% over 125,000` 0.24015 0.02483 9.671
## sj_dem_distancing_pre_post$`% not speaking spanish` 0.09663 0.02686 3.598
## sj_dem_distancing_pre_post$`percent less than 1` -0.04485 0.04817 -0.931
## sj_dem_distancing_pre_post$`percent high speed` 0.04428 0.03808 1.163
## Pr(>|t|)
## (Intercept) 0.012804 *
## sj_dem_distancing_pre_post$`% over 125,000` < 2e-16 ***
## sj_dem_distancing_pre_post$`% not speaking spanish` 0.000349 ***
## sj_dem_distancing_pre_post$`percent less than 1` 0.352196
## sj_dem_distancing_pre_post$`percent high speed` 0.245436
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.739 on 563 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.3789, Adjusted R-squared: 0.3745
## F-statistic: 85.88 on 4 and 563 DF, p-value: < 2.2e-16
# another collection for pre shelter in place behavior
sj_dem_distancing_pre_shelter <- sj_dem_distancing %>%
dplyr::select(-`% not completely at home`) %>%
left_join(sj_internet_by_block %>% dplyr::select(`% not completely at home pre shelter`, blockgroup))
# relabel column for leaving home
colnames(sj_dem_distancing_pre_shelter)[14] <- "% not completely at home"
sj_dem_distancing[is.na(sj_dem_distancing)] <- 0
sj_dem_distancing_pre_shelter[is.na(sj_dem_distancing_pre_shelter)] <- 0
# add column indicating before or after shelter in place, then bind the two sets of data
sj_dem_distancing_pre_shelter <- sj_dem_distancing_pre_shelter %>%
mutate(
income_trendline =
fitted(lm(sj_dem_distancing_pre_shelter$`% not completely at home` ~ sj_dem_distancing_pre_shelter$`% over 125,000`))
) %>%
cbind(`Before or After Shelter-in-Place` = "before")
sj_dem_distancing <-
sj_dem_distancing %>%
mutate(
income_trendline =
fitted(lm(sj_dem_distancing$`% not completely at home` ~ sj_dem_distancing$`% over 125,000`))
) %>%
cbind(`Before or After Shelter-in-Place` = "after") %>%
rbind(sj_dem_distancing_pre_shelter)
# try animating
fig <-
plot_ly (sj_dem_distancing) %>%
add_trace(
x = ~`% over 125,000`,
y = ~`% not completely at home`,
frame = ~`Before or After Shelter-in-Place`,
type = 'scatter',
mode = 'markers'
) %>%
add_trace(
x = ~`% over 125,000`,
y = ~income_trendline,
type = 'scatter',
mode = 'lines',
line = list(size = 5, color = 'rgba(255, 165, 0, 1)'),
frame = ~`Before or After Shelter-in-Place`
) %>%
animation_button(visible = F)
fig
# # save as rds
# saveRDS(sj_dem_distancing, "/Users/simonespeizer/pCloud Drive/Shared/SFBI/Restricted Data Library/Safegraph/covid19analysis/sj_sd_dem_data.rds")
# fig <- plot_ly(sj_dem_distancing) %>%
# add_trace(
# x = ~`% over 125,000`,
# y = ~`% not completely at home`,
# frame = ~`Before or After Shelter-in-Place`,
# type = "scatter",
# mode = "markers",
# name = "Under $125,000",
# marker = list(size = 5, color = 'rgba(50, 150, 200, 1)'),
# visible = T
# ) %>%
# add_trace(
# x = ~`% over 125,000`,
# y = fitted(lm(sj_dem_distancing$`% not completely at home` ~ sj_dem_distancing$`% over 125,000`)),
# name = 'trendline',
# mode = 'lines',
# line = list(size = 5, color = 'rgba(255, 165, 0, 1)'),
# frame = ~`Before or After Shelter-in-Place`,
# visible = F
# ) %>%
# add_trace(
# x = ~`% not speaking spanish`,
# y = ~`% not completely at home`,
# frame = ~`Before or After Shelter-in-Place`,
# name = "speak spanish",
# marker = list(size = 5, color = 'rgba(50, 150, 200, 1)'),
# visible = F
# ) %>%
# add_trace(
# x = ~`% not speaking spanish`,
# y = fitted(lm(sj_dem_distancing$`% not completely at home` ~ sj_dem_distancing$`% not speaking spanish`)),
# name = 'trendline',
# mode = 'lines',
# line = list(size = 5, color = 'rgba(255, 165, 0, 1)'),
# frame = ~`Before or After Shelter-in-Place`,
# visible = F
# ) %>%
# add_trace(
# x = ~`percent associates or higher`,
# y = ~`% not completely at home`,
# frame = ~`Before or After Shelter-in-Place`,
# name = "percent higher degree",
# marker = list(size = 5, color = 'rgba(50, 150, 200, 1)'),
# visible = F
# ) %>%
# add_trace(
# x = ~`percent associates or higher`,
# y = fitted(lm(sj_dem_distancing$`% not completely at home` ~ sj_dem_distancing$`percent associates or higher`)),
# name = 'trendline',
# mode = 'lines',
# line = list(size = 5, color = 'rgba(255, 165, 0, 1)'),
# frame = ~`Before or After Shelter-in-Place`,
# visible = F
# ) %>%
# add_trace(
# x = ~`percent high speed`,
# y = ~`% not completely at home`,
# frame = ~`Before or After Shelter-in-Place`,
# name = "percent high speed internet access",
# marker = list(size = 5, color = 'rgba(50, 150, 200, 1)'),
# visible = F
# ) %>%
# add_trace(
# x = ~`percent high speed`,
# y = fitted(lm(sj_dem_distancing$`% not completely at home` ~ sj_dem_distancing$`percent high speed`)),
# name = 'trendline',
# mode = 'lines',
# line = list(size = 5, color = 'rgba(255, 165, 0, 1)'),
# frame = ~sj_dem_distancing$`Before or After Shelter-in-Place`,
# visible = F
# ) %>%
# add_trace(
# x = ~`percent less than 30`,
# y = ~`% not completely at home`,
# frame = ~`Before or After Shelter-in-Place`,
# name = "percent less than 30",
# marker = list(size = 5, color = 'rgba(50, 150, 200, 1)'),
# visible = F
# ) %>%
# add_trace(
# x = ~`percent less than 30`,
# y = fitted(lm(sj_dem_distancing$`% not completely at home` ~ sj_dem_distancing$`percent less than 30`)),
# name = 'trendline',
# mode = 'lines',
# line = list(size = 5, color = 'rgba(255, 165, 0, 1)'),
# frame = ~`Before or After Shelter-in-Place`,
# visible = F
# ) %>%
# layout(
# updatemenus = list(
# list(
# active = 0,
# type = 'buttons',
# buttons = list(
# list(
# label = "Households Under $125,000",
# method = 'update',
# args = list(list(visible = c(T, T, F, F, F, F, F, F, F, F)),
# list(title = "Under $125,000",
# xaxis = list(title = "% Households Under $125,000 in Income")))),
# list(
# label = "Speaking Spanish",
# method = 'update',
# args = list(list(visible = c(F, F, T, T, F, F, F, F, F, F)),
# list(title = "Not Speaking Spanish",
# xaxis = list(title = "% Residents Not Speaking Spanish")))),
# list(
# label = "Education Level",
# method = 'update',
# args= list(list(visible = c(F, F, F, F, T, T, F, F, F, F)),
# list(xaxis = list(title = "% Residents With Associate's Degree or Higher")))),
# list(
# label = "High Speed Internet",
# method = 'update',
# args= list(list(visible = c(F, F, F, F, F, F, T, T, F, F)),
# list(xaxis = list(title = "% Households With High Speed Internet Access")))),
# list(
# label = "Young Population",
# method = 'update',
# args= list(list(visible = c(F, F, F, F, F, F, T, T, F, F)),
# list(xaxis = list(title = "% Residents Under Age 30"))))
# )
# )
# ),
# yaxis = list(title = "% Residents Leaving Home",
# font = list(size = 15)),
# showlegend = FALSE
# )
# fig
Experimentation with other variables and other ways of analyzing the social distancing data. First I look at a few other possible variables. I start with units in the structure.
# try getting other variables
# get data on units in structure
sj_units_in_structure_by_block <- getCensus(
name = "acs/acs5",
vintage = 2018,
region = "block group:*",
regionin = "state:06+county:085",
vars = "group(B25024)"
) %>%
mutate(
blockgroup =
paste0(state,county,tract,block_group)
) %>%
select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>%
select(-c(contains("EA"),contains("MA"),contains("M"))) %>%
gather(key = "variable", value = "estimate", -blockgroup) %>%
mutate(label = acs_vars$label[match(variable,acs_vars$name)]) %>%
select(-variable) %>%
separate(label, into = c(NA, NA, "units"), sep = "!!") %>%
filter(!is.na(units)) %>%
spread(key = units, value = estimate) %>%
mutate(total_nums = `1, attached` + `1, detached` + `10 to 19` + `2` + `20 to 49`+ `3 or 4` + `5 to 9`+ `50 or more`+ `Boat, RV, van, etc.`+ `Mobile home`, `percent 20 or more` = (`20 to 49`+`50 or more`)* 100/ total_nums, `percent 1 only` = (`1, attached` + `1, detached`)*100/total_nums, `percent > 1` = 100 - `percent 1 only`) %>%
left_join(sj_age_by_block %>% dplyr::select_if(!names(.) %in% c("elderly", "percent elderly", "less than 30", "percent less than 30"))) %>%
filter(!is.na(device_count))
# plot
sj_units_in_structure_by_block %>%
ggplot(aes(
x = `percent 20 or more`,
y = `% not completely at home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of structures with more than 20 units",
y = "Percent devices leaving home on weekdays since shelter-in-place",
title = "San Jose: Social Distancing and 20 or More Units Per Structure"
)
summary(lm(`% not completely at home` ~ `percent 20 or more`, sj_units_in_structure_by_block))
##
## Call:
## lm(formula = `% not completely at home` ~ `percent 20 or more`,
## data = sj_units_in_structure_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.551 -5.052 -0.451 4.274 37.149
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 51.15091 0.40431 126.513 <2e-16 ***
## `percent 20 or more` 0.01873 0.02026 0.924 0.356
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.354 on 566 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.001507, Adjusted R-squared: -0.0002566
## F-statistic: 0.8545 on 1 and 566 DF, p-value: 0.3557
sj_units_in_structure_by_block %>%
ggplot(aes(
x = `percent 1 only`,
y = `% not completely at home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of structures with only one unit",
y = "Percent devices leaving home on weekdays since shelter-in-place",
title = "San Jose: Social Distancing and Only 1 Unit Per Structure"
)
summary(lm(`% not completely at home` ~ `percent 1 only`, sj_units_in_structure_by_block))
##
## Call:
## lm(formula = `% not completely at home` ~ `percent 1 only`, data = sj_units_in_structure_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.452 -5.117 -0.322 4.364 38.053
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 55.38663 0.88348 62.692 < 2e-16 ***
## `percent 1 only` -0.05596 0.01125 -4.975 8.68e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.184 on 566 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.04189, Adjusted R-squared: 0.0402
## F-statistic: 24.75 on 1 and 566 DF, p-value: 8.681e-07
Household type and size:
# load data on household type and size
sj_house_size_type_by_block <- getCensus(
name = "acs/acs5",
vintage = 2018,
region = "block group:*",
regionin = "state:06+county:085",
vars = "group(B11016)"
) %>%
mutate(
blockgroup =
paste0(state,county,tract,block_group)
) %>%
select_if(!names(.) %in% c("GEO_ID","state","county","tract","block_group","NAME")) %>%
select(-c(contains("EA"),contains("MA"),contains("M"))) %>%
gather(key = "variable", value = "estimate", -blockgroup) %>%
mutate(label = acs_vars$label[match(variable,acs_vars$name)]) %>%
select(-variable) %>%
separate(label, into = c(NA, NA, "type", "size"), sep = "!!") %>%
filter(!is.na(type))
# household type
sj_house_type_by_block <- sj_house_size_type_by_block %>%
filter(is.na(size)) %>%
dplyr::select(-size) %>%
spread(key = type, value = estimate) %>%
mutate(`total households` = `Family households` + `Nonfamily households`, `percent nonfamily` = `Nonfamily households` / `total households`) %>%
left_join(sj_age_by_block %>% dplyr::select_if(!names(.) %in% c("elderly", "percent elderly", "less than 30", "percent less than 30"))) %>%
filter(!is.na(device_count))
sj_house_type_by_block %>%
ggplot(aes(
x = `percent nonfamily`,
y = `% not completely at home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent nonfamily households",
y = "Percent devices leaving home on weekdays since shelter-in-place",
title = "San Jose: Social Distancing and Household Type"
)
summary(lm(`% not completely at home` ~ `percent nonfamily`, sj_house_type_by_block))
##
## Call:
## lm(formula = `% not completely at home` ~ `percent nonfamily`,
## data = sj_house_type_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.616 -5.108 -0.157 4.261 39.714
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 48.8864 0.6327 77.262 < 2e-16 ***
## `percent nonfamily` 10.1400 2.1962 4.617 4.82e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.208 on 566 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.0363, Adjusted R-squared: 0.03459
## F-statistic: 21.32 on 1 and 566 DF, p-value: 4.82e-06
# household size
sj_house_size_by_block <- sj_house_size_type_by_block %>%
filter(!is.na(size)) %>%
dplyr::select(-type) %>%
group_by(blockgroup, size) %>%
summarize(`total of this size` = sum(estimate)) %>%
spread(key = size, value = `total of this size`) %>%
mutate(total_nums = `1-person household` + `2-person household` + `3-person household` + `4-person household` + `5-person household`+ `6-person household` + `7-or-more person household`, `percent 5 or more` = (`5-person household`+`6-person household` + `7-or-more person household`)* 100/ total_nums, `percent 1 or 2 only` = (`1-person household` + `2-person household`)*100/total_nums) %>%
left_join(sj_age_by_block %>% dplyr::select_if(!names(.) %in% c("elderly", "percent elderly", "less than 30", "percent less than 30"))) %>%
filter(!is.na(device_count))
sj_house_size_by_block %>%
ggplot(aes(
x = `percent 5 or more`,
y = `% not completely at home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of households with 5 or more people",
y = "Percent devices leaving home on weekdays since shelter-in-place",
title = "San Jose: Social Distancing and Households With 5 or More"
)
summary(lm(`% not completely at home` ~ `percent 5 or more`, sj_house_size_by_block))
##
## Call:
## lm(formula = `% not completely at home` ~ `percent 5 or more`,
## data = sj_house_size_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.136 -4.979 -0.518 4.175 36.154
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 49.85951 0.56004 89.028 < 2e-16 ***
## `percent 5 or more` 0.08454 0.02513 3.364 0.000821 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.278 on 566 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.0196, Adjusted R-squared: 0.01786
## F-statistic: 11.31 on 1 and 566 DF, p-value: 0.0008215
sj_house_size_by_block %>%
ggplot(aes(
x = `percent 1 or 2 only`,
y = `% not completely at home`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of households with 1 or 2 people",
y = "Percent devices leaving home on weekdays since shelter-in-place",
title = "San Jose: Social Distancing and Small Household Size"
)
summary(lm(`% not completely at home` ~ `percent 1 or 2 only`, sj_house_size_by_block))
##
## Call:
## lm(formula = `% not completely at home` ~ `percent 1 or 2 only`,
## data = sj_house_size_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.847 -5.127 -0.450 4.596 37.830
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 50.13694 0.96771 51.81 <2e-16 ***
## `percent 1 or 2 only` 0.02678 0.02013 1.33 0.184
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.348 on 566 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.003118, Adjusted R-squared: 0.001356
## F-statistic: 1.77 on 1 and 566 DF, p-value: 0.1839
Next I consider different ways of looking at the social distancing data. First I try distance traveled.
# try other ways of looking at the social distancing data
# first look at total distance traveled
sj_sd_distance <- sj_socialdistancing %>%
filter(date > shelter_start) %>%
group_by(origin_census_block_group) %>%
summarize(total_dist_traveled = sum(distance_traveled_from_home), device_count = sum(device_count)) %>%
mutate(total_dist_per_device = total_dist_traveled / device_count)
sj_distance_testing <- left_join(sj_ami_by_block, sj_sd_distance, by = c("blockgroup" = "origin_census_block_group")) %>% left_join(sj_age_by_block %>% select(blockgroup, `percent less than 30`))
sj_distance_testing %>% filter(total_dist_per_device < 500) %>%
ggplot(aes(
x = `% over 75,000`,
y = total_dist_per_device
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of housholds with incomes over $75,000 (50% AMI) annually",
y = "Average distance traveled per device during weekdays since shelter-in-place",
title = "San Jose: Social Distancing and Income, Distance Metric"
)
This is very skewed by outliers, and probably not a useful metric.
Now I consider including devices that traveled <1km as staying at (or near) home.
sj_sd_range <- sj_socialdistancing %>%
filter(weekend == F) %>%
filter(date > shelter_start) %>%
mutate(travel_buckets_split = lapply(bucketed_distance_traveled, function(x) strsplit(x, "<1000")[[1]][2]), less_than_1km = lapply(travel_buckets_split, function(x) strsplit(x, ":")[[1]][2]), less_than_1km = lapply(less_than_1km, function(x) strsplit(x, ",")[[1]][1])) %>%
mutate(less_than_1km = lapply(less_than_1km, function(x) str_remove(x, "[}]"))) %>% # clean a bit more
mutate(less_than_1km = as.numeric(less_than_1km), less_than_1km = replace_na(less_than_1km, 0)) %>%
mutate(home_or_1km = completely_home_device_count + less_than_1km) %>%
group_by(origin_census_block_group) %>%
summarize(home_or_1km = sum(home_or_1km), device_count = sum(device_count)) %>%
mutate(`% Within 1km of Home` = (home_or_1km/device_count*100) %>% round(1), `% farther than 1km` = (100-`% Within 1km of Home`))
# join this with other data
sj_1km_testing <- left_join(sj_ami_by_block, sj_sd_range, by = c("blockgroup" = "origin_census_block_group")) %>%
left_join(sj_occupants_per_room_by_block %>% dplyr::select(`percent less than 1`, blockgroup)) %>%
left_join(sj_age_by_block %>% dplyr::select(`percent less than 30`, blockgroup)) %>%
left_join(sj_lang_by_block %>% dplyr::select(`% speaking english > well`, blockgroup))
# plot with income
sj_1km_testing %>%
ggplot(aes(
x = `% over 75,000`,
y = `% farther than 1km`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of housholds with incomes over $75,000 (50% AMI) annually",
y = "% of devices going farther than 1km of home, weekdays since shelter-in-place",
title = "San Jose: Social Distancing and Income, 1km Range"
)
summary(lm(`% farther than 1km` ~ `% over 75,000`, sj_1km_testing))
##
## Call:
## lm(formula = `% farther than 1km` ~ `% over 75,000`, data = sj_1km_testing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.389 -4.795 -0.606 4.185 34.925
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 64.35928 1.11518 57.71 <2e-16 ***
## `% over 75,000` -0.20909 0.01719 -12.16 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.444 on 566 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.2072, Adjusted R-squared: 0.2058
## F-statistic: 148 on 1 and 566 DF, p-value: < 2.2e-16
# plot with age
sj_1km_testing %>%
ggplot(aes(
x = `percent less than 30`,
y = `% farther than 1km`
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of people younger than 30",
y = "Percent of devices farther than 1km of home during weekdays since shelter-in-place",
title = "San Jose: Social Distancing and Age, 1km Range"
)
summary(lm(`% farther than 1km` ~ `percent less than 30`, sj_1km_testing))
##
## Call:
## lm(formula = `% farther than 1km` ~ `percent less than 30`, data = sj_1km_testing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28.505 -4.978 -0.362 4.274 39.808
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 44.49223 1.51348 29.397 < 2e-16 ***
## `percent less than 30` 0.17973 0.03836 4.685 3.51e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.299 on 567 degrees of freedom
## Multiple R-squared: 0.03726, Adjusted R-squared: 0.03557
## F-statistic: 21.95 on 1 and 567 DF, p-value: 3.513e-06
# run multiple regression model
modeltest2 <- lm(sj_1km_testing$`% farther than 1km` ~ sj_1km_testing$`% over 75,000` + sj_1km_testing$`percent less than 30` + sj_1km_testing$`% speaking english > well` + sj_1km_testing$`percent less than 1`)
summary(modeltest2)
##
## Call:
## lm(formula = sj_1km_testing$`% farther than 1km` ~ sj_1km_testing$`% over 75,000` +
## sj_1km_testing$`percent less than 30` + sj_1km_testing$`% speaking english > well` +
## sj_1km_testing$`percent less than 1`)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.646 -4.675 -0.762 4.296 34.809
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 58.57603 4.69256 12.483 <2e-16
## sj_1km_testing$`% over 75,000` -0.21745 0.02166 -10.039 <2e-16
## sj_1km_testing$`percent less than 30` 0.03813 0.04267 0.894 0.3718
## sj_1km_testing$`% speaking english > well` 0.09468 0.04607 2.055 0.0403
## sj_1km_testing$`percent less than 1` -0.03946 0.04679 -0.843 0.3994
##
## (Intercept) ***
## sj_1km_testing$`% over 75,000` ***
## sj_1km_testing$`percent less than 30`
## sj_1km_testing$`% speaking english > well` *
## sj_1km_testing$`percent less than 1`
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.424 on 563 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.2158, Adjusted R-squared: 0.2102
## F-statistic: 38.72 on 4 and 563 DF, p-value: < 2.2e-16
It looks like the fit of these selected variables is slightly better for the social distancing data based on not traveling farther than 1km.
Now I also consider “non-work” behavior.
sj_nonworking_by_block <- sj_socialdistancing %>%
filter(weekend == F) %>%
filter(date > shelter_start) %>%
mutate(nonworking = device_count - completely_home_device_count - part_time_work_behavior_devices - full_time_work_behavior_devices) %>%
group_by(origin_census_block_group) %>%
summarize(nonworking_count = sum(nonworking), total_device = sum(device_count)) %>%
mutate(nonworking_percent = nonworking_count*100 / total_device, percent_only_work_home = 100-nonworking_percent) %>%
left_join(sj_1km_testing %>% dplyr::select(`% over 75,000`, `percent less than 30`, `% speaking english > well`, `percent less than 1`, blockgroup), by = c("origin_census_block_group" = "blockgroup"))
# plot against age and income
sj_nonworking_by_block %>%
ggplot(aes(
x = `% over 75,000`,
y = nonworking_percent
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of housholds with incomes over $75,000 (50% AMI) annually",
y = "Percent of devices leaving home for non-work purposes during weekdays since shelter-in-place",
title = "San Jose: Social Distancing and Income, Nonworking Behavior"
)
summary(lm(nonworking_percent ~ `% over 75,000`, sj_nonworking_by_block))
##
## Call:
## lm(formula = nonworking_percent ~ `% over 75,000`, data = sj_nonworking_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.922 -4.227 -0.758 3.670 33.628
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 52.99603 1.02994 51.46 <2e-16 ***
## `% over 75,000` -0.16521 0.01588 -10.41 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.875 on 566 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.1606, Adjusted R-squared: 0.1591
## F-statistic: 108.3 on 1 and 566 DF, p-value: < 2.2e-16
sj_nonworking_by_block %>%
ggplot(aes(
x = `percent less than 30`,
y = nonworking_percent
)) + geom_point() +
geom_smooth(method=lm) +
labs(
x = "Percent of people younger than 30",
y = "Percent of devices leaving home for non-work purposes during weekdays since shelter-in-place",
title = "San Jose: Social Distancing and Age, Nonworking Behavior"
)
summary(lm(nonworking_percent ~ `percent less than 30`, sj_nonworking_by_block))
##
## Call:
## lm(formula = nonworking_percent ~ `percent less than 30`, data = sj_nonworking_by_block)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.773 -4.144 -0.345 3.400 32.456
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 35.85635 1.33549 26.849 < 2e-16 ***
## `percent less than 30` 0.17865 0.03385 5.277 1.87e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.323 on 567 degrees of freedom
## Multiple R-squared: 0.04682, Adjusted R-squared: 0.04513
## F-statistic: 27.85 on 1 and 567 DF, p-value: 1.871e-07
# multiple regression model
modeltest3 <- lm(sj_nonworking_by_block$nonworking_percent ~ sj_nonworking_by_block$`% over 75,000` + sj_nonworking_by_block$`percent less than 30` + sj_nonworking_by_block$`% speaking english > well` + sj_nonworking_by_block$`percent less than 1`)
summary(modeltest3)
##
## Call:
## lm(formula = sj_nonworking_by_block$nonworking_percent ~ sj_nonworking_by_block$`% over 75,000` +
## sj_nonworking_by_block$`percent less than 30` + sj_nonworking_by_block$`% speaking english > well` +
## sj_nonworking_by_block$`percent less than 1`)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.758 -4.001 -0.742 3.843 31.685
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 44.10456 4.31944 10.211
## sj_nonworking_by_block$`% over 75,000` -0.17101 0.01994 -8.577
## sj_nonworking_by_block$`percent less than 30` 0.08287 0.03928 2.110
## sj_nonworking_by_block$`% speaking english > well` 0.07965 0.04241 1.878
## sj_nonworking_by_block$`percent less than 1` -0.01102 0.04307 -0.256
## Pr(>|t|)
## (Intercept) <2e-16 ***
## sj_nonworking_by_block$`% over 75,000` <2e-16 ***
## sj_nonworking_by_block$`percent less than 30` 0.0353 *
## sj_nonworking_by_block$`% speaking english > well` 0.0609 .
## sj_nonworking_by_block$`percent less than 1` 0.7981
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.833 on 563 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.1752, Adjusted R-squared: 0.1693
## F-statistic: 29.89 on 4 and 563 DF, p-value: < 2.2e-16
These variables do worse for the percent nonworking metric, which makes sense.